
******** This program generates summary statistics for different samples shown in Table 1 of the paper ***************
******** It also generates data with relevant control variables used in the network analysis robustness check in Table 3 ***************


***************************** SAMPLE SUM STATS ******************************************************

* Start from an empty session and switch off output paging.
clear all
set more off, permanently

* $localdir is not defined anywhere in this file, so it has to be set by the
* caller before this do-file runs. Stop with a clear message if it is empty
* instead of silently resolving the paths below to "/Data" and "/Output".
if `"$localdir"' == "" {
    display as error "global macro localdir is not set; define it before running this do-file"
    exit 198
}

* Forward slashes are accepted as directory separators on every platform Stata
* runs on, whereas backslashes only work on Windows.
cd "$localdir/Data"
global output "$localdir/Output"


******** Sample: Full population *******

* Load the register panel
use registerdata19802012_us, clear

*** Attach the unique identifier used in the networks data; keep matched persons only
merge m:1 pnr using "Unique ID (pnr).dta"
keep if _merge == 3
drop _merge


*** Define variables and sample

* Unemployment shock: grossunemp is 1 in the current year and was 0 in the previous year.
* The lag is only filled when the preceding row of the same person is exactly one
* year earlier, so gaps in the panel leave the lag (and hence the shock) missing.
sort pnr year
by pnr (year): generate grossunemp_lag = grossunemp[_n-1] if year[_n-1] + 1 == year
generate grossunemp_enter = (grossunemp_lag == 0 & grossunemp == 1) if !(grossunemp == . | grossunemp_lag == .)

* Restrict to the years and variables needed downstream
keep if inrange(year, 2008, 2012)
tabulate year

keep pnr unique year female age children single grossincome totalassets totaldebt homeowner ///
    educ_basic educ_short educ_medium educ_long grossunemp grossunemp_enter

* This file is reused for the summary statistics of the other samples and for the
* control variables in the networks analysis
save sumstat_pop, replace

* Population statistics are reported for 2009-2012 ...
keep if inrange(year, 2009, 2012)

* ... and for ages 20 to 65
keep if inrange(age, 20, 65)

global statvars "female age children single grossincome totalassets totaldebt homeowner educ_basic educ_short educ_medium educ_long grossunemp grossunemp_enter"


*** Sum stats output
tabulate year

* Mean
tabstat $statvars, by(year) statistics(mean n)

* Median over 5 obs - use this as reported median to comply with Statistics Denmark's micro data policies
* For every variable and year: rank the non-missing values, take the five ranks
* centred on floor(N/2), average them, and copy that average to all rows of the year.
foreach v of varlist $statvars {
    * rank within year; missing values sort last and receive no rank
    bysort year (`v'): generate m5_rank = _n if `v' != .
    * highest rank in the year = number of non-missing values
    bysort year (`v'): egen m5_n = max(m5_rank) if `v' != .
    generate m5_mid = floor(m5_n/2)
    * mean of the values ranked mid-2 ... mid+2
    bysort year: egen m5_mean = mean(`v') if m5_rank >= m5_mid - 2 & m5_rank <= m5_mid + 2
    * spread the window mean to every observation of the year
    bysort year: egen med5_`v' = max(m5_mean)
    drop m5_rank m5_n m5_mid m5_mean
}

* med5_* is constant within year, so its mean by year simply displays it
* (the wildcard expands in creation order, i.e. the order of $statvars)
tabstat med5_*, by(year) statistics(mean)
* Check against the exact median
tabstat $statvars, by(year) statistics(median)

tabulate year




******** i-sample (survey respondents) ******** 

use sumstat_pop, clear

keep if year>=2009 & year<=2012

* Use only respondents used in the networks analysis.
* Register year t is matched to analysis-file year t+1: shift year forward for
* the merge and shift it back afterwards.
replace year=year+1
merge 1:m unique year using "Unemployment Outcomes RF for Analysis - Last Year.dta", ///
    keepusing(unique year prob_unemployed grossunemp_enter_network both ind2_year ocp_year educ_year muni)
keep if _merge==3
drop _merge
replace year=year-1

drop if prob_unemployed==.

* The using file can hold several rows per respondent-year; keep one of them
duplicates drop unique year, force

* Find regression sample: flag the observations reghdfe actually uses
reghdfe prob_unemployed grossunemp_enter_network, absorb(both ind2_year ocp_year educ_year) cluster(muni)
g sample=(e(sample)==1)

global statvars = "female age children single grossincome totalassets totaldebt homeowner educ_basic educ_short educ_medium educ_long grossunemp grossunemp_enter"

*** Sum stats output
tab year
tab year if sample==1

* Mean
tabstat $statvars if sample==1, by(year) s(mean n)

* Median over 5 obs - use this as reported median to comply with Statistics Denmark's micro data policies
* The ranking is restricted to the regression sample (sample==1) so that the
* reported median refers to the same observations as the mean, the N and the
* check median below. Ranking all merged observations instead would describe a
* different set of people whenever reghdfe drops observations.
foreach x of varlist $statvars {
    * observations entering the median: in the regression sample, value not missing
    g byte m5_use=(sample==1 & !missing(`x'))
    * rank and count within year among the used observations only
    bysort year m5_use (`x'): g long m5_rank=_n if m5_use
    by year m5_use: g long m5_n=_N if m5_use
    g long m5_mid=floor(m5_n/2)
    * mean of the five values ranked mid-2 ... mid+2
    bysort year: egen m5_mean=mean(`x') if m5_use & m5_rank>=m5_mid-2 & m5_rank<=m5_mid+2
    * spread the window mean to every observation of the year
    bysort year: egen med5_`x'=max(m5_mean)
    drop m5_use m5_rank m5_n m5_mid m5_mean
}

* med5_* is constant within year, so its mean by year simply displays it
tabstat med5_female med5_age med5_children med5_single med5_grossincome med5_totalassets med5_totaldebt med5_homeowner med5_educ_basic med5_educ_short med5_educ_medium med5_educ_long med5_grossunemp med5_grossunemp_enter if sample==1, by(year) s(mean)
* Check against the exact median on the same sample
tabstat $statvars if sample==1, by(year) s(med)

tab year




******** k-sample (2nd degree links) ******** 

use sumstat_pop, clear

keep if year>=2009 & year<=2012

* Use only 2nd degree links used in the networks analysis.
* The register person is matched as the linked person (unique_network); the
* variable "unique" then comes from the analysis file.
* Register year t is matched to analysis-file year t+1: shift year forward for
* the merge and shift it back afterwards.
rename unique unique_network
replace year=year+1
merge 1:m unique_network year using "Unemployment Outcomes RF for Analysis - Last Year.dta", ///
    keepusing(unique year prob_unemployed grossunemp_enter_network both ind2_year ocp_year educ_year muni)
keep if _merge==3
drop _merge
replace year=year-1

drop if prob_unemployed==.

* A linked person can appear in several rows per year; keep one of them
duplicates drop unique_network year, force

* Find regression sample: flag the observations reghdfe actually uses
reghdfe prob_unemployed grossunemp_enter_network, absorb(both ind2_year ocp_year educ_year) cluster(muni)
g sample=(e(sample)==1)

global statvars = "female age children single grossincome totalassets totaldebt homeowner educ_basic educ_short educ_medium educ_long grossunemp grossunemp_enter"

*** Sum stats

tab year 
tab year if sample==1

* Mean
tabstat $statvars if sample==1, by(year) s(mean n)

* Median over 5 obs - use this as reported median to comply with Statistics Denmark's micro data policies
* The ranking is restricted to the regression sample (sample==1) so that the
* reported median refers to the same observations as the mean, the N and the
* check median below. Ranking all merged observations instead would describe a
* different set of people whenever reghdfe drops observations.
foreach x of varlist $statvars {
    * observations entering the median: in the regression sample, value not missing
    g byte m5_use=(sample==1 & !missing(`x'))
    * rank and count within year among the used observations only
    bysort year m5_use (`x'): g long m5_rank=_n if m5_use
    by year m5_use: g long m5_n=_N if m5_use
    g long m5_mid=floor(m5_n/2)
    * mean of the five values ranked mid-2 ... mid+2
    bysort year: egen m5_mean=mean(`x') if m5_use & m5_rank>=m5_mid-2 & m5_rank<=m5_mid+2
    * spread the window mean to every observation of the year
    bysort year: egen med5_`x'=max(m5_mean)
    drop m5_use m5_rank m5_n m5_mid m5_mean
}

* med5_* is constant within year, so its mean by year simply displays it
tabstat med5_female med5_age med5_children med5_single med5_grossincome med5_totalassets med5_totaldebt med5_homeowner med5_educ_basic med5_educ_short med5_educ_medium med5_educ_long med5_grossunemp med5_grossunemp_enter if sample==1, by(year) s(mean)
* Check against the exact median on the same sample
tabstat $statvars if sample==1, by(year) s(med)

tab year




******** j-sample (1st degree links) ******** 

use sumstat_pop, clear

keep if year>=2009 & year<=2012

* Use only 1st degree links used in the networks analysis.
* The register person is matched as the linked person (unique_network); the
* variable "unique" then comes from the analysis file.
* Register year t is matched to analysis-file year t+1: shift year forward for
* the merge and shift it back afterwards.
rename unique unique_network
replace year=year+1
merge 1:m unique_network year using "First stage for Analysis.dta", ///
    keepusing(unique year prob_unemployed grossunemp_enter_network both ind2_year ocp_year educ_year muni)
keep if _merge==3
drop _merge
replace year=year-1

drop if prob_unemployed==.

* A linked person can appear in several rows per year; keep one of them
duplicates drop unique_network year, force

* Find regression sample: flag the observations reghdfe actually uses
reghdfe prob_unemployed grossunemp_enter_network, absorb(both ind2_year ocp_year educ_year) cluster(muni)
g sample=(e(sample)==1)

global statvars = "female age children single grossincome totalassets totaldebt homeowner educ_basic educ_short educ_medium educ_long grossunemp grossunemp_enter"

*** Sum stats

tab year
tab year if sample==1

* Mean
tabstat $statvars if sample==1, by(year) s(mean n)

* Median over 5 obs - use this as reported median to comply with Statistics Denmark's micro data policies
* The ranking is restricted to the regression sample (sample==1) so that the
* reported median refers to the same observations as the mean, the N and the
* check median below. Ranking all merged observations instead would describe a
* different set of people whenever reghdfe drops observations.
foreach x of varlist $statvars {
    * observations entering the median: in the regression sample, value not missing
    g byte m5_use=(sample==1 & !missing(`x'))
    * rank and count within year among the used observations only
    bysort year m5_use (`x'): g long m5_rank=_n if m5_use
    by year m5_use: g long m5_n=_N if m5_use
    g long m5_mid=floor(m5_n/2)
    * mean of the five values ranked mid-2 ... mid+2
    bysort year: egen m5_mean=mean(`x') if m5_use & m5_rank>=m5_mid-2 & m5_rank<=m5_mid+2
    * spread the window mean to every observation of the year
    bysort year: egen med5_`x'=max(m5_mean)
    drop m5_use m5_rank m5_n m5_mid m5_mean
}

* med5_* is constant within year, so its mean by year simply displays it
tabstat med5_female med5_age med5_children med5_single med5_grossincome med5_totalassets med5_totaldebt med5_homeowner med5_educ_basic med5_educ_short med5_educ_medium med5_educ_long med5_grossunemp med5_grossunemp_enter if sample==1, by(year) s(mean)
* Check against the exact median on the same sample
tabstat $statvars if sample==1, by(year) s(med)

tab year








****************** Dataset with control variables and variables for balance test *************
* Lag all register variables 2 years, except for unemployment shock; this should be lagged 1 year

*** General control variables 
* 2y lag: every register variable except the unemployment shock.
* Moving year forward by two means a row labelled year t carries the values
* observed in t-2.
use sumstat_pop, clear

drop grossunemp_enter

replace year = year + 2

* keep the shifted years 2010-2013
keep if inrange(year, 2010, 2013)
tabulate year

save sumstat_2ylag, replace

* 1y lag: the unemployment shock only.
* Moving year forward by one means a row labelled year t carries the shock
* observed in t-1.
use unique year grossunemp_enter using sumstat_pop, clear

replace year = year + 1

* keep the shifted years 2010-2013
keep if inrange(year, 2010, 2013)
tabulate year

save sumstat_1ylag, replace

